# !pip install --upgrade pip
# !pip install pandas-profiling
# !pip install Folium
# !pip install scikit-learn
# !pip install xgboost
For this question, we will use the BX_18v1.csv PLUTO file located in the input folder.
# Import required libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as d
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import pandas_profiling
from IPython.display import display
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Display floats right-aligned with thousands separators and 3 decimals
pd.options.display.float_format = '{:20,.3f}'.format
# Show every column, and up to 1000 characters per cell, in frame output
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 1000
# Limit numpy array printouts to 3 decimals as well
np.set_printoptions(precision=3)
# Load the NYC 311 dataset, parsing the date fields as datetimes
# (as suggested in the "Ingest the NYC 311 Dataset" page).
df_heating_bronx = pd.read_csv('./input/NYC_311_Dataset.csv',
                               parse_dates=['created_date', 'closed_date'])

# Merge 'HEAT/HOT WATER' & 'HEATING' complaint types under a single label.
# Assign the result rather than calling replace(..., inplace=True) on a
# column selection, which is deprecated chained assignment in recent pandas.
df_heating_bronx['complaint_type'] = df_heating_bronx['complaint_type'].replace(
    to_replace=['HEAT/HOT WATER', 'HEATING'],
    value='HEATING/HOT WATER',
)

# Keep only the merged heating complaints...
df_heating_bronx = df_heating_bronx[df_heating_bronx['complaint_type'] == 'HEATING/HOT WATER']
# ...restricted to the BRONX borough.
df_heating_bronx = df_heating_bronx[df_heating_bronx['borough'] == 'BRONX']

# Convert ZIP to int (missing ZIPs become the 0 sentinel) and compute the
# number of days a complaint stayed open; still-open complaints (no
# closed_date) are measured against "now".
today = d.datetime.now()  # reuse the datetime module already imported as `d`
df_heating_bronx['incident_zip'] = df_heating_bronx.incident_zip.fillna('00000').astype(int)
df_heating_bronx['process_days'] = (
    df_heating_bronx['closed_date'].fillna(today) - df_heating_bronx['created_date']
).dt.days.astype(int)
df_heating_bronx.head(2)
# Dataframe dimensions after filtering
df_heating_bronx.shape
# Build a small summary table: per-column count and percentage of NaNs
nan_counts = df_heating_bronx.isnull().sum(axis=0)
df_NaN_count = nan_counts.to_frame()
df_NaN_count.columns = ['Count']
df_NaN_count['NaN %'] = (df_NaN_count['Count'] / len(df_heating_bronx)).map(
    lambda frac: "{0:.2f}%".format(frac * 100)
)
df_NaN_count
We will use the incident_address column value to merge with the PLUTO dataframe.
Remove records with a NaN value in this column.
# incident_address is the merge key with PLUTO, so drop rows missing it
df_heating_bronx.dropna(subset=['incident_address'], inplace=True)
df_heating_bronx.shape
# Distinct addresses vs. total number of complaints
df_heating_bronx['incident_address'].nunique()
len(df_heating_bronx['incident_address'])
We have 22836 distinct incident addresses, and a total of 604847 complaints.
To attach the number of complaints to a specific PLUTO address, we compute the number of complaints per incident address.
# Number of complaints per incident address, as a two-column frame
# (incident_address, Complaints). reset_index(name=...) names the count
# column directly instead of mutating df.columns.values, which pandas may
# ignore silently.
df_complaints_count = (
    df_heating_bronx.groupby(['incident_address'])
    .size()
    .reset_index(name='Complaints')
)
df_complaints_count.head(10)
# Addresses with the most complaints
df_complaints_count.sort_values(by=['Complaints'], ascending=False).head(10)
df_complaints_count.nunique()
# The columns of the Bronx PLUTO file we keep:
PLUTO_COLUMNS = ['Address', 'BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR',
                 'FacilFAR', 'Lot', 'LotArea', 'LotDepth', 'NumBldgs', 'NumFloors',
                 'OfficeArea', 'ResArea', 'ResidFAR', 'RetailArea',
                 'YearBuilt', 'YearAlter1', 'ZipCode', 'YCoord', 'XCoord']
# usecols makes read_csv parse only these columns instead of the whole
# (much wider) PLUTO file.
df_Bronx = pd.read_csv('./input/BX_18v1.csv', usecols=PLUTO_COLUMNS)
# usecols preserves file order, so re-select to enforce the order above.
df_Bronx = df_Bronx[PLUTO_COLUMNS]
df_Bronx.shape
df_Bronx.head(5)
Optionally convert the ZIP code to a string (this step is left commented out below).
# df_Bronx['ZipCode'] = df_Bronx['ZipCode'].astype(int).astype(str)
We have the 'YearBuilt' column. We should be able to compute the building Age column.
# Inspect the distinct construction years; note the 0 placeholder values
df_Bronx['YearBuilt'].unique()
We have some 0 in the column.
# Count buildings whose YearBuilt is the 0 placeholder
(df_Bronx['YearBuilt'] == 0).sum()
We will replace these 0 values with the current year, so the computed age of those buildings will be 0.
# Map the 0 "unknown year" placeholder to the current year so that the Age
# of those buildings comes out as 0 (rather than dropping the rows).
# Assign the result rather than calling replace(..., inplace=True) on a
# column selection, which is deprecated chained assignment in recent pandas.
current_year = int(d.datetime.now().year)
df_Bronx['YearBuilt'] = df_Bronx['YearBuilt'].replace(0, current_year)
# Building age in years
df_Bronx['Age'] = current_year - df_Bronx['YearBuilt']
df_Bronx.head(5)
# YearBuilt is now fully redundant with Age, so drop it
df_Bronx.drop(['YearBuilt'], axis=1, inplace=True)
df_Bronx.head(5)
df_Bronx.shape
# Left-join the complaint counts onto the PLUTO records by street address;
# PLUTO rows with no matching complaint get NaN counts for now.
df_Bronx = pd.merge(df_Bronx,
                    df_complaints_count,
                    left_on="Address",
                    right_on="incident_address",
                    how="left")
df_Bronx.head(5)
df_Bronx.shape
print('We matched', df_Bronx['incident_address'].count(),
      'of the', df_complaints_count['incident_address'].count(), 'incident addresses.')
# Unmatched rows get 0 complaints; cast back to int, since fillna leaves
# the merged count column as float.
df_Bronx['Complaints'] = df_Bronx['Complaints'].fillna(0).astype(int)
# Binary target: 1 if the building had at least one complaint
df_Bronx['HasComplaint'] = np.where(df_Bronx['Complaints'] == 0, 0, 1)
# The duplicated join key is no longer needed
df_Bronx.drop(['incident_address'], axis=1, inplace=True)
We have 2 different targets, depending on whether we want to model the number of complaints OR whether there has been any complaint at all.
# Generate an exploratory pandas-profiling report over the merged frame
prof = pandas_profiling.ProfileReport(df_Bronx)
#prof
# Render the report inline in the notebook
prof.to_notebook_iframe()
# Pairwise Pearson correlations between all numeric columns
# (kept in `cor` — the following cells reuse it)
cor = df_Bronx.corr(method='pearson')

# Heatmap of the correlation matrix
plt.figure(figsize=(16, 12))
sns.heatmap(cor, annot=True, cmap='coolwarm', square=True, vmax=1, vmin=-1)

# Work around a matplotlib 3.1.1 regression (fixed in 3.1.2) that clips the
# first and last heatmap rows: widen the y-limits by half a cell each way.
bottom, top = plt.ylim()
plt.ylim(bottom + 0.5, top - 0.5)

plt.show()
# Absolute Pearson correlation of every feature with the Complaints target
cor_complaints = abs(cor["Complaints"])
# Keep the 10 most correlated features (both target columns excluded)
corr_features = cor_complaints.sort_values(ascending=False).drop(labels=['Complaints', 'HasComplaint']).head(10)
corr_features
# Horizontal bar chart, strongest correlation on top
corr_features.plot(kind='barh', figsize=(10, 6),fontsize=12)
plt.xlabel('Correlation',fontsize=14)
plt.title('"Complaints" target (Nb of complaints) - Pearson Correlation', fontsize=14)
plt.gca().invert_yaxis()
# Annotate each bar with its correlation value
[plt.text(v, i, '{0:.2f}'.format(v)) for i, v in enumerate(corr_features)];
plt.show()
# Absolute Pearson correlation of every feature with the HasComplaint target
cor_hascomplaint = abs(cor["HasComplaint"])
# Keep the 10 most correlated features (both target columns excluded)
corr_features_hascomplaint = cor_hascomplaint.sort_values(ascending=False).drop(labels=['Complaints', 'HasComplaint']).head(10)
corr_features_hascomplaint
# Horizontal bar chart, strongest correlation on top
corr_features_hascomplaint.plot(kind='barh', figsize=(10, 6),fontsize=12)
plt.xlabel('Correlation',fontsize=14)
plt.title('"HasComplaint" target - Pearson Correlation', fontsize=14)
plt.gca().invert_yaxis()
# Annotate each bar with its correlation value
[plt.text(v, i, '{0:.2f}'.format(v)) for i, v in enumerate(corr_features_hascomplaint)];
plt.show()
# Drop any remaining rows with missing values before modelling
df_Bronx = df_Bronx.dropna(axis=0)

# Predictor columns: every PLUTO attribute kept earlier, plus the derived Age
features = ['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR',
            'FacilFAR', 'Lot', 'LotArea', 'LotDepth', 'NumBldgs', 'NumFloors',
            'OfficeArea', 'ResArea', 'ResidFAR', 'RetailArea',
            'Age', 'YearAlter1', 'ZipCode', 'YCoord', 'XCoord']

# Regression target: the number of complaints per building
y = df_Bronx['Complaints']
X = df_Bronx[features]

# Hold out a validation set. Fixing random_state makes the split
# reproducible across runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
# use SelectFromModel
# use SelectFromModel to get the list of features
# sel = SelectFromModel( RandomForestRegressor(random_state=1) )
# sel.fit(train_X, train_y)
# selected_feat= train_X.columns[(sel.get_support())]
# len(selected_feat)
# Print the names of the most important features
# for feature_list_index in sel.get_support(indices=True):
# print(features[feature_list_index])
# Fit a 100-tree random forest regressor on the training split;
# random_state fixed for reproducibility
forest_model = RandomForestRegressor(n_estimators=100,
random_state=0)
forest_model.fit(train_X, train_y)
# Rank the features by the forest's impurity-based importance scores,
# most important first
feature_importances = pd.DataFrame(forest_model.feature_importances_,
index = train_X.columns,
columns=['importance']).sort_values('importance',
ascending=False)
feature_importances
# Alternative: print the ranking manually from the raw importances array
# importances = forest_model.feature_importances_
# indices = np.argsort(importances)[::-1]
# for f in range(train_X.shape[1]):
# print("%d. %s %d (%f)" % (f + 1, features[indices[f]], indices[f], importances[indices[f]]))
# Rebuild the same feature matrix, target and train/validation split used
# for the random forest, so both models see identical data.
features = ['BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR',
            'FacilFAR', 'Lot', 'LotArea', 'LotDepth', 'NumBldgs', 'NumFloors',
            'OfficeArea', 'ResArea', 'ResidFAR', 'RetailArea',
            'Age', 'YearAlter1', 'ZipCode', 'YCoord', 'XCoord']
y = df_Bronx['Complaints']
X = df_Bronx[features]
# Same fixed random_state as before, so the split is identical
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Gradient-boosted trees with hand-tuned hyper-parameters
xgb_model = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0.045,
    learning_rate=0.07,
    max_depth=20,
    min_child_weight=1.5,
    n_estimators=300,
    reg_alpha=0.65,
    reg_lambda=0.45,
    subsample=0.95,
)
xgb_model.fit(train_X, train_y)
# Rank the features by XGBoost's importance scores, most important first
feature_importances = pd.DataFrame(xgb_model.feature_importances_,
index = train_X.columns,
columns=['importance']).sort_values('importance',
ascending=False)
feature_importances.head(10)
# Bar chart of XGBoost feature importances, most important on top
feature_importances.plot(kind='barh', figsize=(10, 6), fontsize=12)
# The x-axis shows model importance scores, not correlations
plt.xlabel('Importance', fontsize=14)
plt.title('"Complaints" target - XGBoost', fontsize=14)
plt.gca().invert_yaxis()
# Annotate each bar with its importance value
[plt.text(v, i, '{0:.2f}'.format(v)) for i, v in enumerate(feature_importances['importance'])];
plt.show()
Using the Pearson correlation we found that the **NumFloors**, **BuiltFAR**, **BldgDepth**, **ResidFAR**, **FacilFAR**, **XCoord** and **ResArea** features are correlated with the **number of complaints**.
The XGBoost importances again highlight **NumFloors**, **ResArea**, **XCoord** and **BuiltFAR**, and additionally **OfficeArea** and **LotArea**.